import os
import pickle

import gym
import numpy as np
from tqdm import tqdm
from absl import app, flags
from ml_collections import config_flags
from tensorboardX import SummaryWriter

from dataset_utils import D4RLDataset, reward_from_preference, reward_from_preference_transformer, split_into_trajectories
from evaluation import evaluate

from learner import Learner

# os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '.30'

# Command-line flags for this script; they are read through FLAGS inside the
# functions below once app.run(main) has started the program.
FLAGS = flags.FLAGS

# NOTE(review): make_env_and_dataset() takes env_name.split('_')[1] as the task
# name, and this default contains no '_' — confirm the intended default.
flags.DEFINE_string('env_name', 'halfcheetah-expert-v2', 'Environment name.')
# main() only creates this directory; the SummaryWriter there writes to a
# separately built "runs/..." path.
flags.DEFINE_string('save_dir', './runs/', 'Tensorboard logging dir.')
flags.DEFINE_integer('seed', 0, 'Random seed.')
flags.DEFINE_integer('eval_episodes', 10,
                     'Number of episodes used for evaluation.')
# log_interval is not referenced anywhere in the visible code.
flags.DEFINE_integer('log_interval', 5000, 'Logging interval.')
flags.DEFINE_integer('eval_interval', 5000, 'Eval interval.')
# batch_size is used both for training batches in main() and for reward
# relabeling in make_env_and_dataset().
flags.DEFINE_integer('batch_size', 512, 'Mini batch size.')
flags.DEFINE_integer('max_steps', int(1e6), 'Number of training steps.')
flags.DEFINE_boolean('tqdm', True, 'Use tqdm progress bar.')
# use_reward_model is not referenced anywhere in the visible code; the reward
# model is always loaded and applied in make_env_and_dataset().
flags.DEFINE_boolean('use_reward_model', False, 'Use reward model for relabeling reward.')
# model_type is only printed in the visible code; it does not select a code path here.
flags.DEFINE_string('model_type', 'MLP', 'type of reward model.')
# ckpt_dir is not referenced anywhere in the visible code; initialize_model()
# reads from a fixed './saved_model/' directory instead.
flags.DEFINE_string('ckpt_dir',
                    './logs/pref_reward',
                    'ckpt path for reward model.')
# comment is not referenced anywhere in the visible code.
flags.DEFINE_string('comment',
                    'base',
                    'comment for distinguishing experiments.')
# The next three flags are forwarded to reward_from_preference_transformer().
flags.DEFINE_integer('seq_len', 25, 'sequence length for relabeling reward in Transformer.')
flags.DEFINE_bool('use_diff', False, 'boolean whether use difference in sequence for reward relabeling.')
flags.DEFINE_string('label_mode', 'last', 'mode for relabeling reward with tranformer.')

# Hyperparameter config file; main() expands the loaded config into keyword
# arguments for Learner. lock_config=False presumably leaves the loaded config
# unlocked (mutable) — see the ml_collections config_flags documentation.
config_flags.DEFINE_config_file(
    'config',
    'default.py',
    'File path to the training hyperparameter configuration.',
    lock_config=False)


def normalize(dataset, env_name, max_episode_steps=1000):
    """Rescale ``dataset.rewards`` in place by the spread of trajectory returns.

    The dataset is split into trajectories, the return (sum of rewards) of each
    trajectory is computed, and every reward ``r`` is replaced by
    ``r / (max_return - min_return) * max_episode_steps``. When ``env_name``
    contains ``'antmaze'``, ``min_return / len(trajectory)`` is subtracted from
    ``r`` first, where ``trajectory`` is the one the transition belongs to.

    Args:
        dataset: Object exposing ``observations``, ``actions``, ``rewards``,
            ``masks``, ``dones_float``, ``next_observations`` and ``size``.
            Its ``rewards`` attribute is overwritten with a new array.
        env_name: Environment name; only checked for the substring ``'antmaze'``.
        max_episode_steps: Scale factor applied after dividing by the return range.

    Raises:
        ValueError: If the dataset yields no trajectories, or if the largest
            trajectory return is not strictly greater than the smallest one
            (dividing by that range would produce inf/NaN rewards).
    """
    trajs = split_into_trajectories(dataset.observations, dataset.actions,
                                    dataset.rewards, dataset.masks,
                                    dataset.dones_float,
                                    dataset.next_observations)
    if len(trajs) == 0:
        raise ValueError(
            "cannot normalize rewards: dataset contains no trajectories")

    # Length of the owning trajectory for every transition, indexed like the
    # dataset (assumes split_into_trajectories keeps dataset order — the
    # per-index lookup below depends on it).
    owning_traj_len = []
    for traj in tqdm(trajs, total=len(trajs), desc="chunk trajectories"):
        owning_traj_len.extend([len(traj)] * len(traj))

    def compute_returns(traj):
        # The reward is the third field of each transition tuple.
        return sum(step[2] for step in traj)

    # One pass over the returns replaces sorting whole trajectories just to
    # read off the extremes.
    returns = [compute_returns(traj) for traj in trajs]
    min_return, max_return = min(returns), max(returns)
    if not max_return > min_return:
        raise ValueError(
            "cannot normalize rewards: trajectory returns have no spread "
            f"(min_return={min_return}, max_return={max_return})")
    return_range = max_return - min_return

    shift_by_min = 'antmaze' in env_name
    normalized_rewards = []
    for i in range(dataset.size):
        _reward = dataset.rewards[i]
        if shift_by_min:
            _reward -= min_return / owning_traj_len[i]
        _reward /= return_range
        _reward *= max_episode_steps
        normalized_rewards.append(_reward)

    dataset.rewards = np.array(normalized_rewards)


def make_env_and_dataset(env_name: str, seed: int, index):
    """Build the MetaWorld environment and its reward-relabeled offline dataset.

    The second ``'_'``-separated field of ``env_name`` selects both the
    MetaWorld MT1 task and the dataset directory under ``/mnt/data/``. The
    dataset rewards are replaced by predictions of the saved reward model for
    iteration ``index`` and then rescaled with ``normalize``.

    Args:
        env_name: Name of the form ``'<prefix>_<task>[...]'``. Also used to
            locate the reward-model checkpoint and passed to the relabeling
            and normalization helpers.
        seed: Currently unused; the MT1 benchmark is always built with a fixed
            seed of 1337.
        index: Iteration index of the reward-model checkpoint to load.

    Returns:
        Tuple ``(env, dataset)``: the time-limited environment (500 steps) and
        the relabeled, normalized dataset.

    Raises:
        ValueError: If ``env_name`` contains no ``'_'``.
    """
    # Validate before the heavy imports so a bad name fails fast and clearly
    # instead of with a bare IndexError.
    name_parts = env_name.split('_')
    if len(name_parts) < 2:
        raise ValueError(
            f"env_name {env_name!r} must contain '_'; the second "
            "'_'-separated field selects the MetaWorld task and dataset")
    dataset_name = name_parts[1]

    import metaworld
    from gym import wrappers

    ml1 = metaworld.MT1(dataset_name, seed=1337)  # Construct the benchmark, sampling tasks
    env = ml1.train_classes[dataset_name]()  # Create an environment with task
    env = wrappers.TimeLimit(env, 500)
    env.train_tasks = ml1.train_tasks
    env.set_task(ml1.train_tasks[0])
    env._freeze_rand_vec = False

    # NOTE: allow_pickle=True unpickles arbitrary objects; only load dataset
    # files from a trusted location.
    dataset_tmp = np.load(
        f'/mnt/data/{dataset_name}/data_randgoal_08_50_08_batch.npy',
        allow_pickle=True).tolist()
    dataset = D4RLDataset(env, input_dataset=dataset_tmp)

    # ---------------------------------- predict reward -------------------------------------
    # Use the env_name argument throughout rather than the global flag, so the
    # function honours what its caller passed in.
    reward_model = initialize_model(env_name, index)
    print('\n', 'model type: ', FLAGS.model_type, '\n')
    dataset = reward_from_preference_transformer(
        env_name,
        dataset,
        reward_model,
        batch_size=FLAGS.batch_size,
        seq_len=FLAGS.seq_len,
        use_diff=FLAGS.use_diff,
        label_mode=FLAGS.label_mode
    )
    # The model is only needed for relabeling; drop the reference before training.
    del reward_model

    normalize(dataset, env_name, max_episode_steps=env._max_episode_steps)

    return env, dataset


def initialize_model(env_name, index):
    """Load the pickled reward model saved for ``env_name`` at iteration ``index``.

    Reads ``./saved_model/model_{env_name}_iter_{index}.pkl`` (relative to the
    current working directory), prints the path, and returns the object stored
    under the checkpoint's ``'reward_model'`` key.

    NOTE: unpickling can execute arbitrary code; only load trusted checkpoints.
    """
    model_path = f"./saved_model/model_{env_name}_iter_{index}.pkl"
    with open(model_path, "rb") as ckpt_file:
        checkpoint = pickle.load(ckpt_file)
    print('reward model loaded...', model_path)
    return checkpoint['reward_model']


def main(_):
    """Train a ``Learner`` on the relabeled offline dataset with periodic evaluation.

    Builds the environment and dataset, optionally scales ``dataset.masks`` for
    the lowest-return trajectories, then runs ``FLAGS.max_steps`` updates. Every
    ``FLAGS.eval_interval`` steps the agent is evaluated and the statistics are
    written to TensorBoard.

    Args:
        _: Positional arguments handed over by ``absl.app.run``; unused.
    """
    # Experiment settings that are hard-coded rather than exposed as flags.
    discount = False          # when False, the masks are left unchanged
    index = 5                 # reward-model checkpoint iteration
    discount_number = 0.7
    k = 300                   # only used in the run-directory name
    # Assumed dataset layout: 1000 consecutive trajectories of 500 steps each.
    # This is not checked; slices past the end of the data are simply empty.
    num_trajs = 1000
    traj_len = 500
    # Number of highest-return trajectories that keep a mask of 1
    # (numerically num_trajs - k; presumably intentional — confirm).
    num_kept = 700

    save_dir = (f"runs/{FLAGS.env_name}_index_{index}_k_{k}"
                f"_discount_{discount}_{discount_number}_seed_{FLAGS.seed}")

    summary_writer = SummaryWriter(save_dir, write_to_disk=True)
    try:
        os.makedirs(FLAGS.save_dir, exist_ok=True)

        env, dataset = make_env_and_dataset(FLAGS.env_name, FLAGS.seed, index)

        # Start with every transition scaled by discount_number / 0.99
        # (presumably so that a learner discount of 0.99 times the mask equals
        # discount_number — TODO confirm against the Learner config).
        mask_conditioner = np.ones_like(dataset.masks) * discount_number / 0.99
        print('mask shape: ', mask_conditioner.shape)
        returns = [
            np.sum(dataset.rewards[i * traj_len:(i + 1) * traj_len])
            for i in range(num_trajs)
        ]
        print('traj return: ', np.array(returns).shape)
        # Restore a mask of 1 for the num_kept trajectories with the highest return.
        kept_idx = np.argpartition(returns, -num_kept)[-num_kept:]
        for i in kept_idx:
            mask_conditioner[i * traj_len:(i + 1) * traj_len] = 1
        # Stored on the dataset regardless of `discount`; its consumer is not
        # visible in this file.
        dataset.item = mask_conditioner

        if not discount:
            mask_conditioner = np.ones_like(dataset.masks)
        dataset.masks = dataset.masks * mask_conditioner

        kwargs = dict(FLAGS.config)

        agent = Learner(FLAGS.seed,
                        env.observation_space.sample()[np.newaxis],
                        env.action_space.sample()[np.newaxis],
                        max_steps=FLAGS.max_steps,
                        **kwargs)

        # Collected for inspection only; not persisted by this function.
        eval_returns = []
        for i in tqdm(range(1, FLAGS.max_steps + 1), smoothing=0.1, disable=not FLAGS.tqdm):
            batch = dataset.sample(FLAGS.batch_size)
            agent.update(batch)

            if i % FLAGS.eval_interval == 0:
                print(FLAGS.env_name, '='*10)
                eval_stats = evaluate(agent, env, FLAGS.eval_episodes)

                for name, value in eval_stats.items():
                    summary_writer.add_scalar(f'evaluation/average_{name}s', value, i)
                summary_writer.flush()

                eval_returns.append((i, eval_stats['return']))
    finally:
        # Release the event-file handle even if setup or training fails.
        summary_writer.close()

if __name__ == '__main__':
    # Set in the process environment before app.run() hands control to main().
    # NOTE(review): XLA_PYTHON_CLIENT_PREALLOCATE is presumably read by JAX/XLA
    # (used by the learner) to turn off up-front GPU memory preallocation —
    # confirm against the JAX GPU memory allocation docs.
    os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
    app.run(main)
